Roey Angel 2021-05-21
This analysis explores the taxonomical distribution patters in the different samples, based on the DADA2-produced sequences. Large parts of this script are based on this protocol and the accompanying publication by Callahan and colleagues (2016).
set.seed(1000)
min_lib_size <- 5000
metadata_path <- "./"
data_path <- "./DADA2_pseudo/"
Metadata_table <- "TeaTime_joint_Bacteria_metadata.csv"
Seq_table <- "DADA2.seqtab_nochim.tsv"
Tax_table <- "DADA2.taxa_silva.tsv"
Proj_name <- "TeaTime4Schools"Read abundance table, taxonomic classification and metadata into a phyloseq object. Also remove sequences detected as contaminants in 03_Decontamination.html.
# read OTU mat from data file
Raw_data <- read_tsv(paste0(data_path, Seq_table),
trim_ws = TRUE)
contaminant_seqs <- read_csv(paste0(data_path, "decontam_contaminants.csv"),
trim_ws = TRUE,
col_names = FALSE)
Raw_data %<>% # remove contaminant OTUs.
# .[, -grep("CTRL", colnames(.))] %>% # remove ext. cont.
.[!(Raw_data$`#OTU` %in% contaminant_seqs$X1), ]
Raw_data[, 2:ncol(Raw_data)] %>%
t() %>%
as.data.frame() -> abundance_mat # convert to abundance matrix
colnames(abundance_mat) <- pull(Raw_data, "#OTU") # add sequence names
# Read metadata file
read_csv(paste0(metadata_path, Metadata_table),
trim_ws = TRUE) %>%
mutate_at(
c(
"Workshop",
"Season",
"Run",
"Type",
"Sample type",
"Field",
"Replicate",
"Control",
"Gene"
),
~factor(.)
) %>%
mutate_at(c("Extr. Date", "PCR products_16S_send for seq"), ~as.Date(., "%d.%m.%Y")) ->
Metadata
Metadata$Season %<>% fct_relevel("Winter", "Spring", "Summer", "Autumn")
Metadata$Read1_file <- str_replace(Metadata$Read1_file, "(.*)_L001_R1_001.fastq.gz|\\.1\\.fastq.gz", "\\1")
Metadata <- Metadata[Metadata$Read1_file %in% rownames(abundance_mat), ] # remove metadata rows if the samples did not go through qual processing
# Order abundance_mat samples according to the metadata
sample_order <- match(rownames(abundance_mat), Metadata$Read1_file)
abundance_mat %<>%
rownames_to_column('sample_name') %>%
arrange(., sample_order) %>%
column_to_rownames('sample_name') # needed for phyloseq
Metadata$Library.size <- rowSums(abundance_mat)
Metadata <- data.frame(row.names = Metadata$Read1_file, Metadata)
# read taxonomy from data file
Raw_tax_data <- read_tsv(paste0(data_path, Tax_table),
trim_ws = TRUE, col_names = TRUE)
Raw_tax_data %<>%
.[!(Raw_tax_data$`Seq#` %in% contaminant_seqs$X1),] %>% # remove contaminant OTUs.
mutate_all(funs(replace(., is.na(.), "Unclassified"))) # I think mutaute_all is unnecessary here because replace(., is.na(.), "Unclassified") alone should work
Raw_tax_data %>%
select(.,
Kingdom = V8,
Phylum = V9,
Class = V10,
Order = V11,
Family = V12,
Genus = V13) %>%
cbind(Name = colnames(abundance_mat),. ) ->
Taxonomy.bs
Raw_tax_data %>%
select(.,
Kingdom,
Phylum,
Class,
Order,
Family,
Genus) %>%
# map_dfr(~str_remove_all(., "^.__")) %>% # remove "k__" prefixes
add_column(Seq = colnames(abundance_mat)) %>%
column_to_rownames("Seq") %>%
set_colnames(c("Domain", "Phylum", "Class", "Order", "Family", "Genus")) %>%
as.matrix() -> # because tax_table() needs a matrix for retaining row.names
Taxonomy
# generate phyloseq object
Ps_obj <- phyloseq(otu_table(abundance_mat, taxa_are_rows = FALSE),
tax_table(Taxonomy),
sample_data(Metadata)
)
Ps_obj <- filter_taxa(Ps_obj, function(x) sum(x) > 0, TRUE) # remove 0 abundance taxa
Ps_obj <- subset_samples(Ps_obj, sample_sums(Ps_obj) > 0) # remove 0 abundance samples
# Remove mock and control samples
Ps_obj <- subset_samples(Ps_obj, Type != "Control" & Type != "Mock")
# Create a grouping variable for merging
sample_data(Ps_obj) %<>%
as(., "data.frame") %>%
# get_variable(., c("Sample.type", "Field", "Season", "Replicate")) %>%
unite(., "Description", c("Sample.type", "Field", "Season", "Replicate"), remove = FALSE)
sample_data(Ps_obj)$Description %<>% as_factor(.)
# merged_Ps_obj <- merge_samples(Ps_obj, "Description")
# merged_SD <- merge_samples(sample_data(Ps_obj), "Description")
Ps_obj_merged <- MergeSamples(Ps_obj, grouping_name = "Description")First, let’s look at the taxonomic distribution
table(tax_table(Ps_obj_merged)[, "Domain"], exclude = NULL)##
## Archaea Bacteria Eukaryota Unclassified
## 46 15652 45 71
table(tax_table(Ps_obj_merged)[, "Class"], exclude = NULL)##
## 0319-7L14 ABY1 Acidimicrobiia
## 32 2 469
## Acidobacteriia Actinobacteria AKAU4049
## 111 798 2
## Alphaproteobacteria Anaerolineae Armatimonadia
## 2110 313 7
## AT-s3-28 Babeliae Bacilli
## 3 23 207
## Bacteroidia BD2-11_terrestrial_group BD7-11
## 2173 43 8
## Blastocatellia_(Subgroup_4) Calditrichia Chlamydiae
## 153 2 70
## Chloroflexia Chthonomonadetes Clostridia
## 182 8 47
## Dehalococcoidia Deinococci Deltaproteobacteria
## 44 3 1763
## Elusimicrobia Entotheonellia Fibrobacteria
## 3 36 37
## Fimbriimonadia Fusobacteriia Gammaproteobacteria
## 41 1 1782
## Gemmatimonadetes Gitt-GS-136 Gracilibacteria
## 340 26 8
## Holophagae Hydrogenedentia Ignavibacteria
## 45 1 44
## JG30-KF-CM66 KD4-96 Ktedonobacteria
## 13 69 4
## Latescibacteria Leptospirae Limnochordia
## 9 4 2
## Lineage_IIa Lineage_IIb Longimicrobia
## 17 28 61
## MB-A2-108 Melainabacteria Microgenomatia
## 85 35 5
## Mollicutes NC10 Negativicutes
## 8 36 4
## Nitriliruptoria Nitrososphaeria Nitrospira
## 6 33 16
## OLB14 OM190 Omnitrophia
## 11 114 5
## Oxyphotobacteria P2-11E Parcubacteria
## 88 4 21
## Phycisphaerae Pla3_lineage Pla4_lineage
## 270 20 40
## Planctomycetacia Rhodothermia Rubrobacteria
## 870 1 5
## S0134_terrestrial_group Saccharimonadia Sericytochromatia
## 61 38 15
## Spirochaetia Subgroup_11 Subgroup_15
## 3 8 11
## Subgroup_17 Subgroup_18 Subgroup_20
## 43 7 1
## Subgroup_22 Subgroup_25 Subgroup_5
## 88 18 21
## Subgroup_6 Subgroup_9 Synergistia
## 442 3 1
## Thermoanaerobaculia Thermoleophilia Thermoplasmata
## 96 599 11
## TK10 Unclassified vadinHA49
## 69 604 57
## Verrucomicrobiae Woesearchaeia WWE3
## 793 2 2
# table(tax_table(Ps_obj_merged)[, "Family"], exclude = NULL)Now let’s remove some taxa, which are obvious artefacts or those which aren’t bacteria or archaea
domains2remove <- c("", "Eukaryota", "Unclassified")
orders2remove <- c("Chloroplast")
families2remove <- c("Mitochondria")
Ps_obj_filt <- subset_taxa(Ps_obj_merged, !is.na(Phylum) &
!Domain %in% domains2remove &
!Order %in% orders2remove &
!Family %in% families2remove)
Taxonomy.bs %<>%
filter(Taxonomy.bs$Name %in% row.names(Ps_obj_filt@tax_table))
sample_data(Ps_obj_filt)$Lib.size <- rowSums(otu_table(Ps_obj_filt))Now let’s explore the prevalence of different taxa in the database. Prevalence is the number of samples in which a taxa appears at least once. So “Mean prevalence” refers to in how many samples does a sequence belonging to the phylum appears on average, and “Sum prevalence” is the sum of all samples where any sequence from the phylum appears,
prevdf <- apply(X = otu_table(Ps_obj_filt),
MARGIN = ifelse(taxa_are_rows(Ps_obj_filt), yes = 1, no = 2),
FUN = function(x){sum(x > 0)})
# Add taxonomy and total read counts to this data.frame
prevdf <- data.frame(Prevalence = prevdf,
TotalAbundance = taxa_sums(Ps_obj_filt),
tax_table(Ps_obj_filt))
prevdf %>%
group_by(Phylum) %>%
summarise(`Mean prevalence` = mean(Prevalence),
`Sum prevalence` = sum(Prevalence)) ->
Prevalence_phylum_summary
Prevalence_phylum_summary %>%
kable(., digits = c(0, 1, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)| Phylum | Mean prevalence | Sum prevalence |
|---|---|---|
| Acidobacteria | 12.9 | 13700 |
| Actinobacteria | 14.2 | 28380 |
| Armatimonadetes | 6.3 | 534 |
| Bacteroidetes | 13.1 | 29019 |
| BRC1 | 6.8 | 75 |
| Calditrichaeota | 4.5 | 9 |
| Chlamydiae | 2.3 | 161 |
| Chloroflexi | 9.6 | 7138 |
| Cyanobacteria | 4.7 | 344 |
| Deinococcus-Thermus | 1.0 | 3 |
| Dependentiae | 2.2 | 51 |
| Elusimicrobia | 2.9 | 139 |
| Entotheonellaeota | 15.6 | 562 |
| Euryarchaeota | 4.5 | 49 |
| FBP | 8.4 | 126 |
| FCPU426 | 6.0 | 6 |
| Fibrobacteres | 7.4 | 274 |
| Firmicutes | 5.9 | 1523 |
| Fusobacteria | 1.0 | 1 |
| GAL15 | 1.0 | 1 |
| Gemmatimonadetes | 8.1 | 4128 |
| Hydrogenedentes | 19.0 | 19 |
| Latescibacteria | 9.6 | 799 |
| Nanoarchaeaeota | 1.0 | 2 |
| Nitrospirae | 15.8 | 252 |
| Omnitrophicaeota | 1.2 | 6 |
| Patescibacteria | 2.6 | 206 |
| Planctomycetes | 7.7 | 10630 |
| Proteobacteria | 12.9 | 72598 |
| Rokubacteria | 11.9 | 429 |
| Spirochaetes | 2.1 | 15 |
| Synergistetes | 1.0 | 1 |
| Tenericutes | 1.2 | 10 |
| Thaumarchaeota | 27.8 | 919 |
| Unclassified | 5.9 | 1602 |
| Verrucomicrobia | 12.6 | 9973 |
| WPS-2 | 3.4 | 17 |
| WS2 | 5.0 | 5 |
prevdf %>%
group_by(Order) %>%
summarise(`Mean prevalence` = mean(Prevalence),
`Sum prevalence` = sum(Prevalence)) ->
Prevalence_order_summary
Prevalence_order_summary %>%
kable(., digits = c(0, 1, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)| Order | Mean prevalence | Sum prevalence |
|---|---|---|
| 11-24 | 9.9 | 597 |
| 211ds20 | 4.3 | 13 |
| Absconditabacteriales\_(SR1) | 4.3 | 13 |
| Acetobacterales | 12.2 | 854 |
| Acidiferrobacterales | 1.3 | 8 |
| Acidobacteriales | 6.2 | 112 |
| Actinomarinales | 18.0 | 395 |
| Actinomycetales | 1.0 | 1 |
| Aeromonadales | 41.0 | 41 |
| Anaerolineales | 6.1 | 245 |
| Ardenticatenales | 4.7 | 155 |
| Armatimonadales | 1.7 | 12 |
| Azospirillales | 22.0 | 922 |
| Babeliales | 2.2 | 51 |
| Bacillales | 6.3 | 1198 |
| Bacteroidales | 1.0 | 6 |
| Bacteroidetes\_VC2.1\_Bac22 | 6.5 | 85 |
| Bdellovibrionales | 6.8 | 2518 |
| Betaproteobacteriales | 18.4 | 12335 |
| Blastocatellales | 13.9 | 682 |
| C0119 | 9.2 | 37 |
| Caedibacterales | 5.8 | 35 |
| Caldilineales | 6.7 | 201 |
| Calditrichales | 4.5 | 9 |
| Candidatus\_Amesbacteria | 1.0 | 1 |
| Candidatus\_Azambacteria | 1.0 | 2 |
| Candidatus\_Campbellbacteria | 1.0 | 1 |
| Candidatus\_Collierbacteria | 1.0 | 2 |
| Candidatus\_Jorgensenbacteria | 1.0 | 2 |
| Candidatus\_Liptonbacteria | 1.0 | 1 |
| Candidatus\_Magasanikbacteria | 1.0 | 1 |
| Candidatus\_Nomurabacteria | 2.7 | 8 |
| Candidatus\_Peribacteria | 1.0 | 3 |
| Candidatus\_Woesebacteria | 1.0 | 1 |
| Candidatus\_Yanofskybacteria | 1.0 | 1 |
| Candidatus\_Yonathbacteria | 1.0 | 2 |
| Caulobacterales | 21.8 | 3442 |
| CCD24 | 27.7 | 360 |
| CCM11a | 3.6 | 36 |
| CCM19a | 13.0 | 13 |
| Cellvibrionales | 15.5 | 557 |
| Chitinophagales | 12.1 | 9920 |
| Chlamydiales | 2.3 | 161 |
| Chloroflexales | 6.8 | 688 |
| Chthoniobacterales | 10.9 | 2797 |
| Chthonomonadales | 5.6 | 45 |
| Clostridiales | 3.4 | 160 |
| Corynebacteriales | 15.4 | 1143 |
| Coxiellales | 1.0 | 1 |
| Cytophagales | 14.4 | 6477 |
| Deinococcales | 1.0 | 3 |
| Desulfarculales | 8.1 | 331 |
| Desulfuromonadales | 9.8 | 39 |
| Diplorickettsiales | 2.1 | 196 |
| Dongiales | 38.2 | 687 |
| DS-100 | 10.5 | 116 |
| Elsterales | 8.3 | 200 |
| EMP-G18 | 1.0 | 2 |
| Enterobacteriales | 19.4 | 775 |
| Entotheonellales | 15.6 | 562 |
| EPR3968-O8a-Bc78 | 1.0 | 1 |
| Euzebyales | 9.5 | 57 |
| EV818SWSAP88 | 1.0 | 1 |
| Fibrobacterales | 7.4 | 274 |
| Fimbriimonadales | 8.1 | 334 |
| Flavobacteriales | 12.9 | 5003 |
| Frankiales | 18.3 | 1337 |
| Fusobacteriales | 1.0 | 1 |
| Gaiellales | 13.6 | 3144 |
| Gammaproteobacteria\_Incertae\_Sedis | 14.0 | 896 |
| Gemmatales | 6.1 | 2924 |
| Gemmatimonadales | 8.6 | 2922 |
| Glycomycetales | 22.0 | 44 |
| Holosporales | 1.5 | 6 |
| Hydrogenedentiales | 19.0 | 19 |
| Ignavibacteriales | 1.0 | 2 |
| IMCC26256 | 7.0 | 640 |
| Isosphaerales | 9.2 | 333 |
| Kallotenuales | 3.8 | 87 |
| KF-JG30-C25 | 10.0 | 10 |
| KI89A\_clade | 22.0 | 44 |
| Kineosporiales | 26.1 | 365 |
| Kryptoniales | 5.8 | 35 |
| Lactobacillales | 7.0 | 112 |
| Latescibacterales | 8.7 | 78 |
| Legionellales | 2.0 | 123 |
| Leptospirales | 2.2 | 9 |
| Limnochordales | 1.0 | 2 |
| Longimicrobiales | 4.2 | 257 |
| MBNT15 | 8.9 | 89 |
| Methylacidiphilales | 7.9 | 174 |
| Methylococcales | 16.0 | 32 |
| Micavibrionales | 4.3 | 608 |
| Micrococcales | 24.2 | 4481 |
| Micromonosporales | 21.1 | 2614 |
| Micropepsales | 20.4 | 306 |
| Microtrichales | 9.8 | 2840 |
| mle1-8 | 16.8 | 67 |
| MVP-88 | 1.0 | 3 |
| Mycoplasmatales | 2.5 | 5 |
| Myxococcales | 10.2 | 11013 |
| NB1-j | 9.9 | 582 |
| Nitrosococcales | 8.3 | 25 |
| Nitrososphaerales | 27.8 | 919 |
| Nitrospirales | 15.8 | 252 |
| NKB5 | 1.0 | 1 |
| Nostocales | 4.5 | 77 |
| Obscuribacterales | 5.4 | 86 |
| Oceanospirillales | 15.7 | 470 |
| Oligoflexales | 4.1 | 565 |
| Omnitrophales | 1.2 | 6 |
| OPB56 | 7.8 | 163 |
| Opitutales | 16.5 | 2420 |
| Oxyphotobacteria\_Incertae\_Sedis | 1.0 | 3 |
| Paracaedibacterales | 6.4 | 115 |
| Parvibaculales | 2.0 | 4 |
| PB19 | 14.5 | 58 |
| Pedosphaerales | 10.1 | 1713 |
| PeM15 | 1.0 | 2 |
| Phormidesmiales | 1.0 | 3 |
| Phycisphaerales | 5.6 | 611 |
| Pirellulales | 10.0 | 2517 |
| Piscirickettsiales | 1.0 | 1 |
| Pla1\_lineage | 1.7 | 5 |
| Planctomycetales | 8.9 | 841 |
| PLTA13 | 13.2 | 661 |
| Propionibacteriales | 17.1 | 2348 |
| Pseudomonadales | 17.9 | 1751 |
| Pseudonocardiales | 17.7 | 1202 |
| Puniceispirillales | 8.2 | 66 |
| Pyrinomonadales | 20.2 | 666 |
| R7C24 | 9.5 | 971 |
| RBG-13-54-9 | 8.6 | 138 |
| RCP2-54 | 15.0 | 270 |
| Reyranellales | 18.8 | 675 |
| Rhizobiales | 19.6 | 13223 |
| Rhodobacterales | 10.4 | 510 |
| Rhodospirillales | 7.6 | 426 |
| Rhodothermales | 2.0 | 2 |
| Rhodovibrionales | 1.0 | 1 |
| Rickettsiales | 4.1 | 509 |
| Rokubacteriales | 11.9 | 429 |
| Rubrobacterales | 40.8 | 204 |
| S-70 | 1.0 | 1 |
| S-BQ2-57\_soil\_group | 1.0 | 2 |
| S085 | 12.8 | 513 |
| Saccharimonadales | 3.8 | 145 |
| Salinisphaerales | 7.5 | 383 |
| SAR202\_clade | 8.5 | 17 |
| SAR324\_clade(Marine\_group\_B) | 4.7 | 14 |
| SBR1031 | 9.4 | 1806 |
| sediment-surface35 | 1.0 | 1 |
| Selenomonadales | 11.8 | 47 |
| SJA-28 | 7.6 | 99 |
| SM1A07 | 10.5 | 42 |
| Sneathiellales | 11.5 | 69 |
| Solibacterales | 10.4 | 898 |
| Solirubrobacterales | 11.7 | 4118 |
| Sphingobacteriales | 14.8 | 6945 |
| Sphingomonadales | 18.5 | 7650 |
| Spirochaetales | 2.0 | 6 |
| Steroidobacterales | 13.9 | 1471 |
| Streptomycetales | 18.2 | 911 |
| Streptosporangiales | 7.6 | 434 |
| Subgroup\_13 | 1.0 | 1 |
| Subgroup\_2 | 5.5 | 33 |
| Subgroup\_7 | 12.3 | 555 |
| Synergistales | 1.0 | 1 |
| Syntrophobacterales | 1.6 | 11 |
| Tepidisphaerales | 9.8 | 1379 |
| Thermoanaerobaculales | 8.1 | 780 |
| Thermomicrobiales | 10.4 | 595 |
| Tistrellales | 13.1 | 692 |
| UA11 | 4.1 | 53 |
| Unclassified | 9.3 | 21034 |
| Unknown\_Order | 17.8 | 446 |
| Vampirovibrionales | 6.0 | 114 |
| Verrucomicrobiales | 16.2 | 2741 |
| Xanthomonadales | 14.3 | 3655 |
Based on that I’ll remove all phyla with a prevalence of under 10
Prevalence_phylum_summary %>%
filter(`Sum prevalence` < 10) %>%
select(Phylum) %>%
map(as.character) %>%
unlist() ->
filterPhyla
Ps_obj_filt2 <- subset_taxa(Ps_obj_filt, !Phylum %in% filterPhyla)
Taxonomy.bs %<>%
filter(Taxonomy.bs$Name %in% row.names(Ps_obj_filt2@tax_table))
sample_data(Ps_obj_filt2)$Lib.size <- rowSums(otu_table(Ps_obj_filt2))
print(Ps_obj_filt)## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 15568 taxa and 124 samples ]
## sample_data() Sample Data: [ 124 samples by 26 sample variables ]
## tax_table() Taxonomy Table: [ 15568 taxa by 6 taxonomic ranks ]
print(Ps_obj_filt2)## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 15551 taxa and 124 samples ]
## sample_data() Sample Data: [ 124 samples by 26 sample variables ]
## tax_table() Taxonomy Table: [ 15551 taxa by 6 taxonomic ranks ]
Plot general prevalence features of the phyla
# Subset to the remaining phyla
prevdf_phylum_filt <- subset(prevdf, Phylum %in% get_taxa_unique(Ps_obj_filt2, "Phylum"))
ggplot(prevdf_phylum_filt,
aes(TotalAbundance, Prevalence / nsamples(Ps_obj_filt2), color = Phylum)) +
# Include a guess for parameter
geom_hline(yintercept = 0.05,
alpha = 0.5,
linetype = 2) + geom_point(size = 2, alpha = 0.7) +
scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
facet_wrap( ~ Phylum) + theme(legend.position = "none")Plot general prevalence features of the top 20 orders
# Subset to the remaining phyla
prevdf_order_filt <- subset(prevdf, Order %in% get_taxa_unique(Ps_obj_filt2, "Order"))
# grab the top 30 most abundant orders
prevdf_order_filt %>%
group_by(Order) %>%
summarise(Combined.abundance = sum(TotalAbundance)) %>%
arrange(desc(Combined.abundance)) %>%
.[1:30, "Order"] ->
Orders2plot
prevdf_order_filt2 <- subset(prevdf, Order %in% Orders2plot$Order)
ggplot(prevdf_order_filt2,
aes(TotalAbundance, Prevalence / nsamples(Ps_obj_filt2), color = Order)) +
# Include a guess for parameter
geom_hline(yintercept = 0.05,
alpha = 0.5,
linetype = 2) + geom_point(size = 2, alpha = 0.7) +
scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
facet_wrap( ~ Order) + theme(legend.position = "none")Plot general prevalence features of the phyla excluding soil samples
Ps_obj_filt_Teas <- subset_samples(Ps_obj_filt2, Type != "Soil")
prevdf_teas <- apply(X = otu_table(Ps_obj_filt_Teas),
MARGIN = ifelse(taxa_are_rows(Ps_obj_filt_Teas), yes = 1, no = 2),
FUN = function(x){sum(x > 0)})
# Add taxonomy and total read counts to this data.frame
prevdf_teas <- data.frame(Prevalence = prevdf_teas,
TotalAbundance = taxa_sums(Ps_obj_filt_Teas),
tax_table(Ps_obj_filt_Teas))
# Subset to the remaining phyla
prevdf_phylum_filt <- subset(prevdf_teas, Phylum %in% get_taxa_unique(Ps_obj_filt_Teas, "Phylum"))
ggplot(prevdf_phylum_filt,
aes(TotalAbundance, Prevalence / nsamples(Ps_obj_filt_Teas), color = Phylum)) +
# Include a guess for parameter
geom_hline(yintercept = 0.05,
alpha = 0.5,
linetype = 2) + geom_point(size = 2, alpha = 0.7) +
scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
facet_wrap( ~ Phylum) + theme(legend.position = "none")Plot general prevalence features of the top 20 orders (teabag samples only)
# Subset to the remaining phyla
prevdf_order_filt <- subset(prevdf_teas, Order %in% get_taxa_unique(Ps_obj_filt_Teas, "Order"))
# grab the top 30 most abundant orders
prevdf_order_filt %>%
group_by(Order) %>%
summarise(Combined.abundance = sum(TotalAbundance)) %>%
arrange(desc(Combined.abundance)) %>%
.[1:30, "Order"] ->
Orders2plot
prevdf_order_filt2 <- subset(prevdf, Order %in% Orders2plot$Order)
ggplot(prevdf_order_filt2,
aes(TotalAbundance, Prevalence / nsamples(Ps_obj_filt_Teas), color = Order)) +
# Include a guess for parameter
geom_hline(yintercept = 0.05,
alpha = 0.5,
linetype = 2) + geom_point(size = 2, alpha = 0.7) +
scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
facet_wrap( ~ Order) + theme(legend.position = "none") #### Unsupervised filtering by prevalence I’ll remove all sequences which appear in less than 5% of the samples
# Define prevalence threshold as 5% of total samples
prevalenceThreshold <- 0.05 * nsamples(Ps_obj_filt)
prevalenceThreshold## [1] 6.2
# Execute prevalence filter, using `prune_taxa()` function
keepTaxa <-
row.names(prevdf_phylum_filt)[(prevdf_phylum_filt$Prevalence >= prevalenceThreshold)]
Ps_obj_filt3 <- prune_taxa(keepTaxa, Ps_obj_filt2)
Taxonomy.bs %<>%
filter(Taxonomy.bs$Name %in% row.names(Ps_obj_filt3@tax_table))
sample_data(Ps_obj_filt3)$Lib.size <- rowSums(otu_table(Ps_obj_filt3))
print(Ps_obj_filt2)## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 15551 taxa and 124 samples ]
## sample_data() Sample Data: [ 124 samples by 26 sample variables ]
## tax_table() Taxonomy Table: [ 15551 taxa by 6 taxonomic ranks ]
print(Ps_obj_filt3)## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 4217 taxa and 124 samples ]
## sample_data() Sample Data: [ 124 samples by 26 sample variables ]
## tax_table() Taxonomy Table: [ 4217 taxa by 6 taxonomic ranks ]
This removed 11334 or 73% of the ESVs!.
However all these removed ESVs accounted for only:
prevdf_phylum_filt %>%
arrange(., Prevalence) %>%
group_by(Prevalence > prevalenceThreshold) %>%
summarise(Abundance = sum(TotalAbundance)) %>%
mutate(`Rel. Ab.` = percent(Abundance / sum(Abundance))) %>%
kable(., digits = c(0, 1, 0)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)| Prevalence > prevalenceThreshold | Abundance | Rel. Ab. |
|---|---|---|
| FALSE | 99783 | 2% |
| TRUE | 4207003 | 98% |
So it’s fine to remove them.
Let’s make a fasta file after filtering the sequences:
Seq_file <- "DADA2.Seqs.fa"
readDNAStringSet(paste0(data_path, Seq_file)) %>%
.[taxa_names(Ps_obj_filt3)] %>%
writeXStringSet(., filepath = paste0(data_path, str_remove(Seq_file, ".fa*"), "_filtered.fa"), format = "fasta", width = 1000)Plot general prevalence features of the phyla after filtering
# Subset to the remaining phyla
prevdf_phylum_filt <- subset(prevdf, Phylum %in% get_taxa_unique(Ps_obj_filt3, "Phylum"))
ggplot(prevdf_phylum_filt,
aes(TotalAbundance, Prevalence / nsamples(Ps_obj_filt3), color = Phylum)) +
# Include a guess for parameter
geom_hline(yintercept = 0.05,
alpha = 0.5,
linetype = 2) + geom_point(size = 2, alpha = 0.7) +
scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
facet_wrap( ~ Phylum) + theme(legend.position = "none")Plot general prevalence features of the top 20 orders
# Subset to the remaining phyla
prevdf_order_filt <- subset(prevdf, Order %in% get_taxa_unique(Ps_obj_filt3, "Order"))
# grab the top 30 most abundant orders
prevdf_order_filt %>%
group_by(Order) %>%
summarise(Combined.abundance = sum(TotalAbundance)) %>%
arrange(desc(Combined.abundance)) %>%
.[1:30, "Order"] ->
Orders2plot
prevdf_order_filt2 <- subset(prevdf, Order %in% Orders2plot$Order)
ggplot(prevdf_order_filt2,
aes(TotalAbundance, Prevalence / nsamples(Ps_obj_filt3), color = Order)) +
# Include a guess for parameter
geom_hline(yintercept = 0.05,
alpha = 0.5,
linetype = 2) + geom_point(size = 2, alpha = 0.7) +
scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
facet_wrap( ~ Order) + theme(legend.position = "none")Plot general prevalence features of the phyla excluding soil samples
# Subset to the remaining phyla
prevdf_phylum_filt_teas <- subset(prevdf_teas, Phylum %in% get_taxa_unique(Ps_obj_filt_Teas, "Phylum"))
ggplot(prevdf_phylum_filt,
aes(TotalAbundance, Prevalence / nsamples(Ps_obj_filt_Teas), color = Phylum)) +
# Include a guess for parameter
geom_hline(yintercept = 0.05,
alpha = 0.5,
linetype = 2) + geom_point(size = 2, alpha = 0.7) +
scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
facet_wrap( ~ Phylum) + theme(legend.position = "none")Plot general prevalence features of the top 20 orders
# Subset to the remaining phyla
prevdf_order_filt <- subset(prevdf_teas, Order %in% get_taxa_unique(Ps_obj_filt_Teas, "Order"))
# grab the top 30 most abundant orders
prevdf_order_filt %>%
group_by(Order) %>%
summarise(Combined.abundance = sum(TotalAbundance)) %>%
arrange(desc(Combined.abundance)) %>%
.[1:30, "Order"] ->
Orders2plot
prevdf_order_filt2 <- subset(prevdf, Order %in% Orders2plot$Order)
ggplot(prevdf_order_filt2,
aes(TotalAbundance, Prevalence / nsamples(Ps_obj_filt_Teas), color = Order)) +
# Include a guess for parameter
geom_hline(yintercept = 0.05,
alpha = 0.5,
linetype = 2) + geom_point(size = 2, alpha = 0.7) +
scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
facet_wrap( ~ Order) + theme(legend.position = "none")# remove samples with reads < min_lib_size
Ps_obj_filt3 %>%
subset_samples(., sample_sums(Ps_obj_filt3) > min_lib_size) %>%
filter_taxa(., function(x)
sum(x) > 0, TRUE) ->
# transform_sample_counts(., function(x)
# x / sum(x) * 100) ->
Ps_obj_filt3_subset
Ps_obj_filt3_subset %>%
get_variable() %>%
mutate_at(., "Sample.type",
~fct_relevel(., levels = c("Soil", "Green tea", "Rooibos"))) %>%
mutate_at(., "Season",
~fct_relevel(., levels = c("Winter", "Spring", "Summer", "Autumn"))) %>%
arrange(Field, Sample.type, Season, Replicate) %>%
pull(Description) %>%
as.character() ->
Sample.order
Ps_obj_filt3_subset_ra <- transform_sample_counts(Ps_obj_filt3_subset, function(x){x / sum(x)} * 100)
# grabthe top 100 most abundant OTUs
Ps_obj_filt3_subset_100 <-
prune_taxa(names(sort(taxa_sums(Ps_obj_filt3_subset_ra), TRUE)[1:100]), Ps_obj_filt3_subset_ra)
plot_heatmap(
Ps_obj_filt3_subset_100,
method = NULL,
distance = NULL,
sample.label = "Description",
taxa.label = "Order",
sample.order = Sample.order,
low = "#000033",
high = "#FF3300"
)Ps_obj_filt3_subset_ra %>%
subset_taxa(., Phylum == "Proteobacteria") %>%
transform_sample_counts(., function(x) x / sum(x) * 100) ->
Ps_obj_filt3_subset_proteo
plot_heatmap(
Ps_obj_filt3_subset_proteo,
method = NULL,
distance = NULL,
sample.label = "Description",
sample.order = Sample.order,
taxa.label = "Order",
low = "#000033",
high = "#FF3300"
)Ps_obj_filt3_subset_ra %>%
subset_taxa(., Phylum == "Actinobacteria") %>%
transform_sample_counts(., function(x) x / sum(x) * 100) ->
Ps_obj_filt3_subset_actino
plot_heatmap(
Ps_obj_filt3_subset_actino,
method = NULL,
distance = NULL,
sample.label = "Description",
sample.order = Sample.order,
taxa.label = "Order",
low = "#000033",
high = "#FF3300"
)Ps_obj_filt3_subset_ra %>%
subset_taxa(., Phylum == "Bacteroidetes") %>%
transform_sample_counts(., function(x) x / sum(x) * 100) ->
Ps_obj_filt3_subset_bacter
plot_heatmap(
Ps_obj_filt3_subset_bacter,
method = NULL,
distance = NULL,
sample.label = "Description",
sample.order = Sample.order,
taxa.label = "Order",
low = "#000033",
high = "#FF3300"
) Ps_obj_filt3_subset_ra %>%
subset_taxa(., Phylum == "Acidobacteria") %>%
transform_sample_counts(., function(x) x / sum(x) * 100) ->
Ps_obj_filt3_subset_acido
plot_heatmap(
Ps_obj_filt3_subset_acido,
method = NULL,
distance = NULL,
sample.label = "Description",
sample.order = Sample.order,
taxa.label = "Phylum",
low = "#000033",
high = "#FF3300"
)Ps_obj_filt3_subset_ra %>%
subset_taxa(., Order == "Chitinophagales") %>%
transform_sample_counts(., function(x) x / sum(x) * 100) ->
Ps_obj_filt3_subset_chitino
plot_heatmap(
Ps_obj_filt3_subset_chitino,
method = NULL,
distance = NULL,
sample.label = "Description",
sample.order = Sample.order,
taxa.label = "Order",
low = "#000033",
high = "#FF3300"
)Ps_obj_filt3_subset_ra %>%
subset_taxa(., Order == "Rhizobiales") %>%
transform_sample_counts(., function(x) x / sum(x) * 100) ->
Ps_obj_filt3_subset_rhizo
plot_heatmap(
Ps_obj_filt3_subset_rhizo,
method = NULL,
distance = NULL,
sample.label = "Description",
sample.order = Sample.order,
taxa.label = "Order",
low = "#000033",
high = "#FF3300"
)Ps_obj_filt3_subset_ra %>%
subset_taxa(., Order == "Betaproteobacteriales") %>%
transform_sample_counts(., function(x) x / sum(x) * 100) ->
Ps_obj_filt3_subset_beta
plot_heatmap(
Ps_obj_filt3_subset_beta,
method = NULL,
distance = NULL,
sample.label = "Description",
sample.order = Sample.order,
taxa.label = "Order",
low = "#000033",
high = "#FF3300"
)Ps_obj_filt3_subset_ra %>%
subset_taxa(., Order == "Myxococcales") %>%
transform_sample_counts(., function(x) x / sum(x) * 100) ->
Ps_obj_filt3_subset_myxo
plot_heatmap(
Ps_obj_filt3_subset_myxo,
method = NULL,
distance = NULL,
sample.label = "Description",
sample.order = Sample.order,
taxa.label = "Order",
low = "#000033",
high = "#FF3300"
) Let’s look at the agglomerated taxa
Ps_obj_filt3_subset_ra %>%
tax_glom(., "Phylum", NArm = TRUE) ->
Ps_obj_filt_glom
plot_heatmap(
Ps_obj_filt_glom,
# method = "NMDS",
# distance = "bray",
sample.order = Sample.order,
sample.label = "Description",
taxa.label = "Phylum",
taxa.order = "Phylum",
low = "#000033",
high = "#FF3300"
) + theme_bw(base_size = 20) + theme(axis.text.x = element_text(hjust = 1.0, angle = 90.0))PlotAbundance(Ps_obj_filt3_subset_ra, phylum2plot = "Bacteroidetes")# plotBefore <- PlotAbundance(Ps_obj_filt3_subset, taxa = "Proteobacteria")
# plotAfter <- PlotAbundance(Ps_obj_filt3_subset_ra, taxa = "Proteobacteria")
# Combine each plot into one graphic.
# grid.arrange(nrow = 2, plotBefore, plotAfter)Ps_obj_filt3_subset_ra_taxa <- subset_taxa(Ps_obj_filt3_subset_ra, Order == "Flavobacteriales")
PlotAbundance(Ps_obj_filt3_subset_ra_taxa, phylum2plot = "Bacteroidetes", Facet = "Genus")Ps_obj_filt3_subset_df <- psmelt(Ps_obj_filt3_subset)
# Ps_obj_filt3_subset_df %>%
# filter(Species == "Epibolus pulchripes") ->
# Ps_obj_filt3_subset_df_EP
# PlotViolin(Ps_obj_filt3_subset_df_EP)
PlotViolin(Ps_obj_filt3_subset_df)Ps_obj_filt3_glom <- tax_glom(Ps_obj_filt3_subset_ra,
"Phylum",
NArm = TRUE)
Ps_obj_filt3_glom_DF <- psmelt(Ps_obj_filt3_glom)
Ps_obj_filt3_glom_DF$Phylum %<>% as.character()
# group dataframe by Phylum, calculate median rel. abundance
Ps_obj_filt3_glom_DF %>%
group_by(Phylum) %>%
summarise(median = median(Abundance)) ->
medians
# find Phyla whose rel. abund. is less than 0.5%
Rare_phyla <- medians[medians$median <= 0.005, ]$Phylum
# change their name to "Rare"
Ps_obj_filt3_glom_DF[Ps_obj_filt3_glom_DF$Phylum %in% Rare_phyla, ]$Phylum <- 'Rare'
# re-group
Ps_obj_filt3_glom_DF %>%
group_by(Sample, Sample.name, Phylum, Sample.type, Field) %>%
summarise(Abundance = sum(Abundance)) ->
Ps_obj_filt3_glom_DF_2plot
# ab.taxonomy$Freq <- sqrt(ab.taxonomy$Freq)
# Ps_obj_filt3_glom_rel_DF$Phylum %<>% sub("unclassified", "Unclassified", .)
# Ps_obj_filt3_glom_rel_DF$Phylum %<>% sub("uncultured", "Unclassified", .)
Ps_obj_filt3_glom_DF_2plot %>%
group_by(Sample) %>%
filter(Phylum == "Rare") %>%
summarise(`Rares (%)` = sum(Abundance * 100)) ->
Rares
# Percentage of reads classified as rare
Rares %>%
kable(., digits = c(2), caption = "Percentage of reads per sample classified as rare:") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)| Sample | Rares (%) |
|---|---|
| Green\_tea\_Cambisol\_Autumn\_1 | 1.91 |
| Green\_tea\_Cambisol\_Autumn\_2 | 1.76 |
| Green\_tea\_Cambisol\_Autumn\_3 | 1.49 |
| Green\_tea\_Cambisol\_Autumn\_4 | 2.26 |
| Green\_tea\_Cambisol\_Spring\_1 | 22.50 |
| Green\_tea\_Cambisol\_Spring\_2 | 0.91 |
| Green\_tea\_Cambisol\_Spring\_3 | 23.93 |
| Green\_tea\_Cambisol\_Spring\_4 | 6.89 |
| Green\_tea\_Cambisol\_Summer\_1 | 10.15 |
| Green\_tea\_Cambisol\_Summer\_2 | 10.32 |
| Green\_tea\_Cambisol\_Summer\_3 | 22.58 |
| Green\_tea\_Cambisol\_Summer\_4 | 5.29 |
| Green\_tea\_Cambisol\_Winter\_1 | 12.78 |
| Green\_tea\_Cambisol\_Winter\_2 | 6.25 |
| Green\_tea\_Cambisol\_Winter\_3 | 6.32 |
| Green\_tea\_Cambisol\_Winter\_4 | 8.15 |
| Green\_tea\_Fluvisol\_Autumn\_1 | 2.95 |
| Green\_tea\_Fluvisol\_Autumn\_2 | 1.10 |
| Green\_tea\_Fluvisol\_Autumn\_3 | 4.85 |
| Green\_tea\_Fluvisol\_Autumn\_4 | 0.99 |
| Green\_tea\_Fluvisol\_Spring\_1 | 2.42 |
| Green\_tea\_Fluvisol\_Spring\_2 | 7.07 |
| Green\_tea\_Fluvisol\_Spring\_3 | 10.12 |
| Green\_tea\_Fluvisol\_Spring\_4 | 0.00 |
| Green\_tea\_Fluvisol\_Summer\_1 | 21.22 |
| Green\_tea\_Fluvisol\_Summer\_2 | 9.25 |
| Green\_tea\_Fluvisol\_Summer\_3 | 7.97 |
| Green\_tea\_Fluvisol\_Summer\_4 | 15.90 |
| Green\_tea\_Fluvisol\_Winter\_1 | 2.80 |
| Green\_tea\_Fluvisol\_Winter\_2 | 10.17 |
| Green\_tea\_Fluvisol\_Winter\_3 | 5.18 |
| Green\_tea\_Fluvisol\_Winter\_4 | 2.75 |
| Green\_tea\_Luvisol\_Autumn\_1 | 0.35 |
| Green\_tea\_Luvisol\_Autumn\_2 | 1.03 |
| Green\_tea\_Luvisol\_Autumn\_3 | 0.70 |
| Green\_tea\_Luvisol\_Autumn\_4 | 17.77 |
| Green\_tea\_Luvisol\_Spring\_1 | 0.00 |
| Green\_tea\_Luvisol\_Spring\_3 | 0.00 |
| Green\_tea\_Luvisol\_Spring\_4 | 1.27 |
| Green\_tea\_Luvisol\_Summer\_2 | 16.18 |
| Green\_tea\_Luvisol\_Summer\_3 | 8.81 |
| Green\_tea\_Luvisol\_Winter\_1 | 30.26 |
| Green\_tea\_Luvisol\_Winter\_2 | 28.45 |
| Green\_tea\_Luvisol\_Winter\_3 | 11.62 |
| Green\_tea\_Luvisol\_Winter\_4 | 13.95 |
| Green\_tea\_Unburied\_Winter\_1 | 73.37 |
| Rooibos\_Cambisol\_Autumn\_1 | 2.68 |
| Rooibos\_Cambisol\_Autumn\_2 | 0.80 |
| Rooibos\_Cambisol\_Autumn\_3 | 4.50 |
| Rooibos\_Cambisol\_Autumn\_4 | 0.36 |
| Rooibos\_Cambisol\_Spring\_1 | 0.73 |
| Rooibos\_Cambisol\_Spring\_2 | 8.71 |
| Rooibos\_Cambisol\_Spring\_3 | 0.00 |
| Rooibos\_Cambisol\_Spring\_4 | 0.79 |
| Rooibos\_Cambisol\_Summer\_1 | 69.14 |
| Rooibos\_Cambisol\_Summer\_2 | 83.05 |
| Rooibos\_Cambisol\_Summer\_3 | 10.11 |
| Rooibos\_Cambisol\_Summer\_4 | 32.78 |
| Rooibos\_Cambisol\_Winter\_1 | 4.12 |
| Rooibos\_Cambisol\_Winter\_2 | 10.99 |
| Rooibos\_Cambisol\_Winter\_3 | 10.41 |
| Rooibos\_Cambisol\_Winter\_4 | 18.00 |
| Rooibos\_Fluvisol\_Autumn\_1 | 2.27 |
| Rooibos\_Fluvisol\_Autumn\_2 | 1.86 |
| Rooibos\_Fluvisol\_Autumn\_3 | 6.46 |
| Rooibos\_Fluvisol\_Autumn\_4 | 8.59 |
| Rooibos\_Fluvisol\_Spring\_1 | 0.93 |
| Rooibos\_Fluvisol\_Spring\_2 | 14.71 |
| Rooibos\_Fluvisol\_Spring\_3 | 1.62 |
| Rooibos\_Fluvisol\_Spring\_4 | 0.00 |
| Rooibos\_Fluvisol\_Summer\_1 | 27.02 |
| Rooibos\_Fluvisol\_Summer\_2 | 24.94 |
| Rooibos\_Fluvisol\_Summer\_3 | 18.05 |
| Rooibos\_Fluvisol\_Summer\_4 | 19.01 |
| Rooibos\_Fluvisol\_Winter\_1 | 9.39 |
| Rooibos\_Fluvisol\_Winter\_2 | 31.21 |
| Rooibos\_Fluvisol\_Winter\_3 | 10.02 |
| Rooibos\_Fluvisol\_Winter\_4 | 18.90 |
| Rooibos\_Luvisol\_Autumn\_1 | 0.00 |
| Rooibos\_Luvisol\_Autumn\_2 | 28.30 |
| Rooibos\_Luvisol\_Autumn\_3 | 6.80 |
| Rooibos\_Luvisol\_Autumn\_4 | 1.80 |
| Rooibos\_Luvisol\_Spring\_1 | 5.13 |
| Rooibos\_Luvisol\_Spring\_2 | 11.37 |
| Rooibos\_Luvisol\_Spring\_3 | 1.21 |
| Rooibos\_Luvisol\_Spring\_4 | 9.93 |
| Rooibos\_Luvisol\_Summer\_1 | 4.94 |
| Rooibos\_Luvisol\_Summer\_2 | 13.41 |
| Rooibos\_Luvisol\_Summer\_4 | 7.08 |
| Rooibos\_Luvisol\_Winter\_1 | 7.42 |
| Rooibos\_Luvisol\_Winter\_2 | 18.50 |
| Rooibos\_Luvisol\_Winter\_3 | 4.64 |
| Rooibos\_Luvisol\_Winter\_4 | 10.43 |
| Rooibos\_Unburied\_Winter\_1 | 16.60 |
| Soil\_Cambisol\_Autumn\_1 | 201.50 |
| Soil\_Cambisol\_Autumn\_2 | 141.37 |
| Soil\_Cambisol\_Autumn\_3 | 148.25 |
| Soil\_Cambisol\_Spring\_1 | 84.01 |
| Soil\_Cambisol\_Spring\_2 | 97.74 |
| Soil\_Cambisol\_Summer\_1 | 161.58 |
| Soil\_Cambisol\_Summer\_2 | 128.35 |
| Soil\_Cambisol\_Summer\_3 | 138.61 |
| Soil\_Cambisol\_Winter\_1 | 104.15 |
| Soil\_Cambisol\_Winter\_2 | 102.26 |
| Soil\_Fluvisol\_Autumn\_2 | 216.91 |
| Soil\_Fluvisol\_Autumn\_3 | 245.48 |
| Soil\_Fluvisol\_Spring\_1 | 185.35 |
| Soil\_Fluvisol\_Spring\_2 | 239.95 |
| Soil\_Fluvisol\_Summer\_1 | 274.82 |
| Soil\_Fluvisol\_Summer\_2 | 227.69 |
| Soil\_Fluvisol\_Summer\_3 | 261.46 |
| Soil\_Fluvisol\_Winter\_1 | 266.89 |
| Soil\_Fluvisol\_Winter\_2 | 166.05 |
| Soil\_Luvisol\_Autumn\_1 | 208.92 |
| Soil\_Luvisol\_Autumn\_2 | 184.14 |
| Soil\_Luvisol\_Autumn\_3 | 395.29 |
| Soil\_Luvisol\_Spring\_1 | 190.94 |
| Soil\_Luvisol\_Spring\_2 | 156.35 |
| Soil\_Luvisol\_Summer\_1 | 221.86 |
| Soil\_Luvisol\_Summer\_2 | 182.94 |
| Soil\_Luvisol\_Summer\_3 | 216.44 |
| Soil\_Luvisol\_Winter\_1 | 102.42 |
| Soil\_Luvisol\_Winter\_2 | 225.64 |
sample_order <- match(Rares$Sample, row.names(sample_data(Ps_obj_filt3_glom)))
Rares %<>% arrange(., sample_order)
Rares %>%
cbind(., sample_data(Ps_obj_filt3_glom)) %>%
group_by(Sample.type) %>%
summarise(`Rares (%)` = mean(`Rares (%)`)) ->
Rares_merged
# Percentage of reads classified as rare
Rares_merged %>%
kable(., digits = c(2), caption = "Percentage of reads per sample classified as rare:") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)| Sample.type | Rares (%) |
|---|---|
| Green tea | 9.83 |
| Rooibos | 12.49 |
| Soil | 188.88 |
Ps_obj_filt3_glom_DF_2plot %>%
group_by(Phylum) %>%
summarise(sum.Taxa = sum(Abundance)) %>%
arrange(desc(sum.Taxa)) -> Taxa.rank
Ps_obj_filt3_glom_DF_2plot$Phylum %<>%
factor(., levels = Taxa.rank$Phylum) %>%
fct_relevel(., "Rare", after = Inf)
p_taxa_box <-
ggplot(Ps_obj_filt3_glom_DF_2plot, aes(x = Phylum, y = (Abundance))) +
geom_boxplot(aes(group = interaction(Phylum, Sample.type)), position = position_dodge(width = 0.9), fatten = 1) +
geom_point(
aes(colour = Sample.type),
position = position_jitterdodge(dodge.width = 1),
alpha = 1 / 2,
stroke = 0,
size = 2
) +
# scale_colour_manual(values = pom4, name = "") +
theme_cowplot(font_size = 11, font_family = f_name) +
labs(x = NULL, y = "Relative abundance (%)") +
guides(colour = guide_legend(override.aes = list(size = 5))) +
facet_grid(Field ~ .) +
background_grid(major = "xy",
minor = "none") +
theme(axis.text.x = element_text(
angle = 45,
vjust = 0.9,
hjust = 0.9
))
print(p_taxa_box)dir.create(paste0(data_path, "Krona/"))
for (i in seq(nsamples(Ps_obj_filt3_subset))) {
sample_data <-
data.frame(Abundance = get_taxa(Ps_obj_filt3_subset, i), as(tax_table(Ps_obj_filt3_subset), "matrix"))
sample_data <- sample_data[sample_data[, 1] > 0, ]
write_tsv(sample_data, paste0(data_path, "Krona/", sample_names(Ps_obj_filt3_subset)[i], ".tsv"))
}
sample_data(Ps_obj_filt3_subset)$Sample.type %<>% fct_recode(., Green_tea = "Green tea")
sample_data(Ps_obj_filt3_subset)$Sample.type_field_season <- paste0(
get_variable(Ps_obj_filt3_subset, "Sample.type"),
"_",
get_variable(Ps_obj_filt3_subset, "Field"),
"_",
get_variable(Ps_obj_filt3_subset, "Season")
)
Ps_obj_filt3_subset %>%
phyloseq::merge_samples(., "Sample.type_field_season", fun = mean) ->
merged.Ps_obj
for (i in seq(nsamples(merged.Ps_obj))) {
sample_data <-
data.frame(Abundance = get_taxa(merged.Ps_obj, i), as(tax_table(merged.Ps_obj), "matrix"))
sample_data <- sample_data[sample_data[, 1] > 0, ]
write_tsv(sample_data, paste0(data_path, "Krona/", sample_names(merged.Ps_obj)[i], ".tsv"))
}
list.files(paste0(data_path, "Krona/"), full.names = TRUE) %>%
paste(., collapse = " ") %>%
paste0("/usr/local/bin/ktImportText ", .,
" -o ",
data_path,
"Krona/TeaTime_Krona.html") %>%
system()saveRDS(Ps_obj_filt, file = paste0(data_path, Proj_name, "16S_filt.RDS"))
saveRDS(Ps_obj_filt3, file = paste0(data_path, Proj_name, "16S_filt3.RDS"))sessioninfo::session_info() %>%
details::details(
summary = 'Current session info',
open = TRUE
)─ Session info ─────────────────────────────────────────────────────────────────────────
setting value
version R version 4.0.3 (2020-10-10)
os Ubuntu 18.04.5 LTS
system x86_64, linux-gnu
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe/Prague
date 2021-05-21
─ Packages ─────────────────────────────────────────────────────────────────────────────
package * version date lib source
ade4 1.7-16 2020-10-28 [1] CRAN (R 4.0.2)
ape 5.5 2021-04-25 [1] CRAN (R 4.0.3)
assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.2)
backports 1.2.1 2020-12-09 [1] CRAN (R 4.0.2)
Biobase 2.50.0 2020-10-27 [1] Bioconductor
BiocGenerics * 0.36.1 2021-04-16 [1] Bioconductor
biomformat 1.18.0 2020-10-27 [1] Bioconductor
Biostrings * 2.58.0 2020-10-27 [1] Bioconductor
broom * 0.7.6 2021-04-05 [1] CRAN (R 4.0.3)
cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.0.2)
cli 2.5.0 2021-04-26 [1] CRAN (R 4.0.3)
clipr 0.7.1 2020-10-08 [1] CRAN (R 4.0.2)
cluster 2.1.2 2021-04-17 [1] CRAN (R 4.0.3)
codetools 0.2-18 2020-11-04 [1] CRAN (R 4.0.2)
colorspace 2.0-1 2021-05-04 [1] CRAN (R 4.0.3)
cowplot * 1.1.1 2020-12-30 [1] CRAN (R 4.0.2)
crayon 1.4.1 2021-02-08 [1] CRAN (R 4.0.3)
data.table 1.14.0 2021-02-21 [1] CRAN (R 4.0.3)
DBI 1.1.1 2021-01-15 [1] CRAN (R 4.0.3)
dbplyr 2.1.1 2021-04-06 [1] CRAN (R 4.0.3)
desc 1.3.0 2021-03-05 [1] CRAN (R 4.0.3)
details 0.2.1 2020-01-12 [1] CRAN (R 4.0.2)
digest 0.6.27 2020-10-24 [1] CRAN (R 4.0.2)
doParallel * 1.0.16 2020-10-16 [1] CRAN (R 4.0.2)
dplyr * 1.0.6 2021-05-05 [1] CRAN (R 4.0.3)
ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.0.3)
evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.2)
extrafont * 0.17 2014-12-08 [1] CRAN (R 4.0.2)
extrafontdb 1.0 2012-06-11 [1] CRAN (R 4.0.2)
fansi 0.4.2 2021-01-15 [1] CRAN (R 4.0.3)
farver 2.1.0 2021-02-28 [1] CRAN (R 4.0.3)
forcats * 0.5.1 2021-01-27 [1] CRAN (R 4.0.3)
foreach * 1.5.1 2020-10-15 [1] CRAN (R 4.0.2)
fs 1.5.0 2020-07-31 [1] CRAN (R 4.0.2)
generics 0.1.0 2020-10-31 [1] CRAN (R 4.0.2)
ggplot2 * 3.3.3 2020-12-30 [1] CRAN (R 4.0.2)
glue 1.4.2 2020-08-27 [1] CRAN (R 4.0.2)
gridExtra * 2.3 2017-09-09 [1] CRAN (R 4.0.2)
gtable 0.3.0 2019-03-25 [1] CRAN (R 4.0.2)
haven 2.4.1 2021-04-23 [1] CRAN (R 4.0.3)
highr 0.9 2021-04-16 [1] CRAN (R 4.0.3)
hms 1.1.0 2021-05-17 [1] CRAN (R 4.0.3)
htmltools 0.5.1.1 2021-01-22 [1] CRAN (R 4.0.3)
httr 1.4.2 2020-07-20 [1] CRAN (R 4.0.2)
igraph 1.2.6 2020-10-06 [1] CRAN (R 4.0.2)
IRanges * 2.24.1 2020-12-12 [1] Bioconductor
iterators * 1.0.13 2020-10-15 [1] CRAN (R 4.0.2)
jsonlite 1.7.2 2020-12-09 [1] CRAN (R 4.0.2)
kableExtra * 1.3.4 2021-02-20 [1] CRAN (R 4.0.3)
knitr * 1.33 2021-04-24 [1] CRAN (R 4.0.3)
labeling 0.4.2 2020-10-20 [1] CRAN (R 4.0.2)
lattice * 0.20-44 2021-05-02 [1] CRAN (R 4.0.3)
lifecycle 1.0.0 2021-02-15 [1] CRAN (R 4.0.3)
lubridate 1.7.10 2021-02-26 [1] CRAN (R 4.0.3)
magrittr * 2.0.1 2020-11-17 [1] CRAN (R 4.0.2)
MASS 7.3-54 2021-05-03 [1] CRAN (R 4.0.3)
Matrix 1.3-3 2021-05-04 [1] CRAN (R 4.0.3)
mgcv 1.8-35 2021-04-18 [1] CRAN (R 4.0.3)
modelr 0.1.8 2020-05-19 [1] CRAN (R 4.0.2)
multtest 2.46.0 2020-10-27 [1] Bioconductor
munsell 0.5.0 2018-06-12 [1] CRAN (R 4.0.2)
my.tools * 0.5 2020-09-30 [1] local
nlme 3.1-152 2021-02-04 [1] CRAN (R 4.0.3)
permute * 0.9-5 2019-03-12 [1] CRAN (R 4.0.2)
phyloseq * 1.34.0 2020-10-27 [1] Bioconductor
pillar 1.6.1 2021-05-16 [1] CRAN (R 4.0.3)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.2)
plyr 1.8.6 2020-03-03 [1] CRAN (R 4.0.2)
png 0.1-7 2013-12-03 [1] CRAN (R 4.0.2)
prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.0.2)
progress 1.2.2 2019-05-16 [1] CRAN (R 4.0.2)
purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.0.2)
R6 2.5.0 2020-10-28 [1] CRAN (R 4.0.2)
ragg * 1.1.2 2021-03-17 [1] CRAN (R 4.0.3)
Rcpp 1.0.6 2021-01-15 [1] CRAN (R 4.0.3)
readr * 1.4.0 2020-10-05 [1] CRAN (R 4.0.2)
readxl 1.3.1 2019-03-13 [1] CRAN (R 4.0.2)
reprex 2.0.0 2021-04-02 [1] CRAN (R 4.0.3)
reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.0.2)
rhdf5 2.34.0 2020-10-27 [1] Bioconductor
rhdf5filters 1.2.1 2021-05-03 [1] Bioconductor
Rhdf5lib 1.12.1 2021-01-26 [1] Bioconductor
rlang 0.4.11 2021-04-30 [1] CRAN (R 4.0.3)
rmarkdown * 2.8 2021-05-07 [1] CRAN (R 4.0.3)
rprojroot 2.0.2 2020-11-15 [1] CRAN (R 4.0.2)
rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.0.2)
Rttf2pt1 1.3.8 2020-01-10 [1] CRAN (R 4.0.2)
rvest 1.0.0 2021-03-09 [1] CRAN (R 4.0.3)
S4Vectors * 0.28.1 2020-12-09 [1] Bioconductor
scales * 1.1.1 2020-05-11 [1] CRAN (R 4.0.2)
sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.2)
stringi 1.6.2 2021-05-17 [1] CRAN (R 4.0.3)
stringr * 1.4.0 2019-02-10 [1] CRAN (R 4.0.2)
survival 3.2-11 2021-04-26 [1] CRAN (R 4.0.3)
svglite * 2.0.0 2021-02-20 [1] CRAN (R 4.0.3)
systemfonts 1.0.2 2021-05-11 [1] CRAN (R 4.0.3)
textshaping 0.3.4 2021-05-11 [1] CRAN (R 4.0.3)
tibble * 3.1.2 2021-05-16 [1] CRAN (R 4.0.3)
tidyr * 1.1.3 2021-03-03 [1] CRAN (R 4.0.3)
tidyselect 1.1.1 2021-04-30 [1] CRAN (R 4.0.3)
tidyverse * 1.3.1 2021-04-15 [1] CRAN (R 4.0.3)
utf8 1.2.1 2021-03-12 [1] CRAN (R 4.0.3)
vctrs 0.3.8 2021-04-29 [1] CRAN (R 4.0.3)
vegan * 2.5-7 2020-11-28 [1] CRAN (R 4.0.3)
viridisLite 0.4.0 2021-04-13 [1] CRAN (R 4.0.3)
webshot 0.5.2 2019-11-22 [1] CRAN (R 4.0.2)
withr 2.4.2 2021-04-18 [1] CRAN (R 4.0.3)
xfun 0.23 2021-05-15 [1] CRAN (R 4.0.3)
xml2 1.3.2 2020-04-23 [1] CRAN (R 4.0.2)
XVector * 0.30.0 2020-10-27 [1] Bioconductor
yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.2)
zlibbioc 1.36.0 2020-10-27 [1] Bioconductor
[1] /home/angel/R/library
[2] /usr/local/lib/R/site-library
[3] /usr/lib/R/site-library
[4] /usr/lib/R/libraryCallahan BJ, Sankaran K, Fukuyama JA et al. Bioconductor workflow for microbiome data analysis: From raw reads to community analyses. F1000Research 2016;5:1492.